import pandas as pd
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC,LinearSVC
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
# from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from imblearn.over_sampling import SMOTE
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
# MBTI dataset: one row per user, columns `type` (one of 16 MBTI labels)
# and `posts` ('|||'-separated forum posts for that user).
data=pd.read_csv('mbti.csv')
data.head()  # quick visual sanity check of the first rows
| type | posts | |
|---|---|---|
| 0 | INFJ | 'http://www.youtube.com/watch?v=qsXHcwe3krw|||... |
| 1 | ENTP | 'I'm finding the lack of me in these posts ver... |
| 2 | INTP | 'Good one _____ https://www.youtube.com/wat... |
| 3 | INTJ | 'Dear INTP, I enjoyed our conversation the o... |
| 4 | ENTJ | 'You're fired.|||That's another silly misconce... |
data.describe(include='all')
| type | posts | |
|---|---|---|
| count | 8675 | 8675 |
| unique | 16 | 8675 |
| top | INFP | 'http://www.youtube.com/watch?v=qsXHcwe3krw|||... |
| freq | 1832 | 1 |
data.describe()
| type | posts | |
|---|---|---|
| count | 8675 | 8675 |
| unique | 16 | 8675 |
| top | INFP | 'http://www.youtube.com/watch?v=qsXHcwe3krw|||... |
| freq | 1832 | 1 |
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8675 entries, 0 to 8674 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 type 8675 non-null object 1 posts 8675 non-null object dtypes: object(2) memory usage: 135.7+ KB
data.posts[0]
"'http://www.youtube.com/watch?v=qsXHcwe3krw|||http://41.media.tumblr.com/tumblr_lfouy03PMA1qa1rooo1_500.jpg|||enfp and intj moments https://www.youtube.com/watch?v=iz7lE1g4XM4 sportscenter not top ten plays https://www.youtube.com/watch?v=uCdfze1etec pranks|||What has been the most life-changing experience in your life?|||http://www.youtube.com/watch?v=vXZeYwwRDw8 http://www.youtube.com/watch?v=u8ejam5DP3E On repeat for most of today.|||May the PerC Experience immerse you.|||The last thing my INFJ friend posted on his facebook before committing suicide the next day. Rest in peace~ http://vimeo.com/22842206|||Hello ENFJ7. Sorry to hear of your distress. It's only natural for a relationship to not be perfection all the time in every moment of existence. Try to figure the hard times as times of growth, as...|||84389 84390 http://wallpaperpassion.com/upload/23700/friendship-boy-and-girl-wallpaper.jpg http://assets.dornob.com/wp-content/uploads/2010/04/round-home-design.jpg ...|||Welcome and stuff.|||http://playeressence.com/wp-content/uploads/2013/08/RED-red-the-pokemon-master-32560474-450-338.jpg Game. Set. Match.|||Prozac, wellbrutin, at least thirty minutes of moving your legs (and I don't mean moving them while sitting in your same desk chair), weed in moderation (maybe try edibles as a healthier alternative...|||Basically come up with three items you've determined that each type (or whichever types you want to do) would more than likely use, given each types' cognitive functions and whatnot, when left by...|||All things in moderation. Sims is indeed a video game, and a good one at that. Note: a good one at that is somewhat subjective in that I am not completely promoting the death of any given Sim...|||Dear ENFP: What were your favorite video games growing up and what are your now, current favorite video games? :cool:|||https://www.youtube.com/watch?v=QyPqT8umzmY|||It appears to be too late. :sad:|||There's someone out there for everyone.|||Wait... 
I thought confidence was a good thing.|||I just cherish the time of solitude b/c i revel within my inner world more whereas most other time i'd be workin... just enjoy the me time while you can. Don't worry, people will always be around to...|||Yo entp ladies... if you're into a complimentary personality,well, hey.|||... when your main social outlet is xbox live conversations and even then you verbally fatigue quickly.|||http://www.youtube.com/watch?v=gDhy7rdfm14 I really dig the part from 1:46 to 2:50|||http://www.youtube.com/watch?v=msqXffgh7b8|||Banned because this thread requires it of me.|||Get high in backyard, roast and eat marshmellows in backyard while conversing over something intellectual, followed by massages and kisses.|||http://www.youtube.com/watch?v=Mw7eoU3BMbE|||http://www.youtube.com/watch?v=4V2uYORhQOk|||http://www.youtube.com/watch?v=SlVmgFQQ0TI|||Banned for too many b's in that sentence. How could you! Think of the B!|||Banned for watching movies in the corner with the dunces.|||Banned because Health class clearly taught you nothing about peer pressure.|||Banned for a whole host of reasons!|||http://www.youtube.com/watch?v=IRcrv41hgz4|||1) Two baby deer on left and right munching on a beetle in the middle. 2) Using their own blood, two cavemen diary today's latest happenings on their designated cave diary wall. 3) I see it as...|||a pokemon world an infj society everyone becomes an optimist|||49142|||http://www.youtube.com/watch?v=ZRCEq_JFeFM|||http://discovermagazine.com/2012/jul-aug/20-things-you-didnt-know-about-deserts/desert.jpg|||http://oyster.ignimgs.com/mediawiki/apis.ign.com/pokemon-silver-version/d/dd/Ditto.gif|||http://www.serebii.net/potw-dp/Scizor.jpg|||Not all artists are artists because they draw. It's the idea that counts in forming something of your own... like a signature.|||Welcome to the robot ranks, person who downed my self-esteem cuz I'm not an avid signature artist like herself. 
:proud:|||Banned for taking all the room under my bed. Ya gotta learn to share with the roaches.|||http://www.youtube.com/watch?v=w8IgImn57aQ|||Banned for being too much of a thundering, grumbling kind of storm... yep.|||Ahh... old high school music I haven't heard in ages. http://www.youtube.com/watch?v=dcCRUPCdB1w|||I failed a public speaking class a few years ago and I've sort of learned what I could do better were I to be in that position again. A big part of my failure was just overloading myself with too...|||I like this person's mentality. He's a confirmed INTJ by the way. http://www.youtube.com/watch?v=hGKLI-GEc6M|||Move to the Denver area and start a new life for myself.'"
# 80/20 split, stratified on the personality type so the heavy class
# imbalance (INFP alone is 1832/8675 rows) is mirrored in both splits.
train_data,test_data=train_test_split(data,test_size=0.2,random_state=42,
stratify=data.type)
Helper function to clean the raw text: lowercase it, strip URLs, and replace non-alphanumeric characters with spaces.
def clear_text(data):
    """Clean the ``posts`` column of a DataFrame for vectorization.

    Each document is lowercased, URLs are removed, and every remaining
    non-alphanumeric character is replaced with a space.

    Parameters
    ----------
    data : pandas.DataFrame
        Must have a ``posts`` column of raw text strings.

    Returns
    -------
    tuple[list[str], list[int]]
        The cleaned documents and, in parallel, their word counts.
    """
    try:
        from tqdm import tqdm as progress  # optional progress bar
    except ImportError:  # degrade gracefully when tqdm is unavailable
        def progress(it):
            return it
    cleaned_text = []
    data_length = []
    for sentence in progress(data.posts):
        sentence = sentence.lower()
        # Remove URLs first so their characters don't leak into the text.
        sentence = re.sub(r'https?://[^\s<>"]+|www\.[^\s<>"]+', '', sentence)
        # Everything that is not a digit or lowercase letter becomes a space.
        sentence = re.sub(r'[^0-9a-z]', ' ', sentence)
        data_length.append(len(sentence.split()))
        cleaned_text.append(sentence)
    return cleaned_text, data_length
# NOTE(review): train_data/test_data are slices of `data`, so assigning to
# their .posts column can trigger pandas' SettingWithCopyWarning (masked
# here by warnings.filterwarnings) — consider .copy() after the split.
train_data.posts,train_length=clear_text(train_data)
test_data.posts,test_length=clear_text(test_data)
100%|████████████████████████████████████████████████████████████████████████████| 6940/6940 [00:03<00:00, 1939.03it/s] 100%|████████████████████████████████████████████████████████████████████████████| 1735/1735 [00:00<00:00, 1906.13it/s]
train_data.posts[0]
' and intj moments sportscenter not top ten plays pranks what has been the most life changing experience in your life on repeat for most of today may the perc experience immerse you the last thing my infj friend posted on his facebook before committing suicide the next day rest in peace enfj7 sorry to hear of your distress it s only natural for a relationship to not be perfection all the time in every moment of existence try to figure the hard times as times of growth as 84389 84390 welcome and stuff game set match prozac wellbrutin at least thirty minutes of moving your legs and i don t mean moving them while sitting in your same desk chair weed in moderation maybe try edibles as a healthier alternative basically come up with three items you ve determined that each type or whichever types you want to do would more than likely use given each types cognitive functions and whatnot when left by all things in moderation sims is indeed a video game and a good one at that note a good one at that is somewhat subjective in that i am not completely promoting the death of any given sim dear enfp what were your favorite video games growing up and what are your now current favorite video games cool appears to be too late sad there s someone out there for everyone wait i thought confidence was a good thing i just cherish the time of solitude b c i revel within my inner world more whereas most other time i d be workin just enjoy the me time while you can don t worry people will always be around to yo entp ladies if you re into a complimentary personality well hey when your main social outlet is xbox live conversations and even then you verbally fatigue quickly i really dig the part from 1 46 to 2 50 because this thread requires it of me get high in backyard roast and eat marshmellows in backyard while conversing over something intellectual followed by massages and kisses for too many b s in that sentence how could you think of the b banned for watching movies in the corner with 
the dunces banned because health class clearly taught you nothing about peer pressure banned for a whole host of reasons two baby deer on left and right munching on a beetle in the middle 2 using their own blood two cavemen diary today s latest happenings on their designated cave diary wall 3 i see it as a pokemon world an infj society everyone becomes an optimist 49142 all artists are artists because they draw it s the idea that counts in forming something of your own like a signature welcome to the robot ranks person who downed my self esteem cuz i m not an avid signature artist like herself proud banned for taking all the room under my bed ya gotta learn to share with the roaches for being too much of a thundering grumbling kind of storm yep ahh old high school music i haven t heard in ages failed a public speaking class a few years ago and i ve sort of learned what i could do better were i to be in that position again a big part of my failure was just overloading myself with too i like this person s mentality he s a confirmed intj by the way to the denver area and start a new life for myself '
# Compare word-length distributions of the two splits; they should overlap
# closely because the split is stratified and random.
plt.figure(figsize=(15,10))
# distplot was deprecated and later removed from seaborn;
# histplot(..., kde=True, stat='density') is the modern equivalent.
sns.histplot(train_length,kde=True,stat='density',label='train data word length')
sns.histplot(test_length,kde=True,stat='density',label='test data word length')
plt.title('Number of words in text',fontdict={'size':20,
'style':'italic'})
plt.legend()  # the label= arguments are only shown once a legend is drawn
plt.show()
px.pie(train_data,names='type',title='Personality type',hole=0.3)
class Lemmatizer:
    """Callable tokenizer for TfidfVectorizer.

    Splits a document on whitespace, drops tokens of length <= 2, and
    lemmatizes the rest with WordNet.

    The original notebook defined two conflicting ``def Lemmatizer``
    functions, neither of which returned tokens; the recorded vectorizer
    repr (``tokenizer=<__main__.Lemmatizer object ...>``) shows the
    intended design was a callable instance like this one.
    """

    def __init__(self):
        # Build the (relatively expensive) lemmatizer once per instance.
        self.lemmatizer = WordNetLemmatizer()

    def __call__(self, sentence):
        # Return the token list TfidfVectorizer expects from a tokenizer.
        return [self.lemmatizer.lemmatize(word)
                for word in sentence.split()
                if len(word) > 2]
# Tf-idf over the 5000 most frequent terms, minus English stop words.
# NOTE(review): tokenizer=Lemmatizer() only works if Lemmatizer() produces a
# callable that returns a token list — confirm the Lemmatizer definition.
# Passing a custom tokenizer alongside stop_words='english' makes sklearn
# warn that the stop list may not match the produced tokens.
vectorizer=TfidfVectorizer(max_features=5000,stop_words='english',tokenizer=Lemmatizer())
vectorizer.fit(train_data.posts)
TfidfVectorizer(max_features=5000, stop_words='english',
tokenizer=<__main__.Lemmatizer object at 0x0000021C2F072940>)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. TfidfVectorizer(max_features=5000, stop_words='english',
tokenizer=<__main__.Lemmatizer object at 0x0000021C2F072940>)train_post=vectorizer.transform(train_data.posts).toarray()
test_post=vectorizer.transform(test_data.posts).toarray()
train_post[0]
array([0.05223205, 0. , 0. , ..., 0. , 0. ,
0. ])
# Encode the 16 MBTI type strings as integers 0-15 (classes_ is sorted
# alphabetically, so e.g. 'ENFJ' -> 0, 'ISTP' -> 15).
target_encoder=LabelEncoder()
train_target=target_encoder.fit_transform(train_data.type)
# BUG FIX: use transform (not fit_transform) on the test labels so they
# reuse the exact mapping learned from the training split instead of
# silently refitting the encoder.
test_target=target_encoder.transform(test_data.type)
target_encoder.classes_
array(['ENFJ', 'ENFP', 'ENTJ', 'ENTP', 'ESFJ', 'ESFP', 'ESTJ', 'ESTP',
'INFJ', 'INFP', 'INTJ', 'INTP', 'ISFJ', 'ISFP', 'ISTJ', 'ISTP'],
dtype=object)
train_data.type.unique()
array(['INFP', 'ISTP', 'ENFJ', 'ENFP', 'INFJ', 'ESFP', 'INTJ', 'ISFP',
'ISTJ', 'ENTP', 'ENTJ', 'INTP', 'ISFJ', 'ESTP', 'ESFJ', 'ESTJ'],
dtype=object)
train_target
array([ 9, 15, 0, ..., 2, 11, 3])
models_accuracy={}  # model name -> test accuracy, filled in as each model is evaluated
# Baseline linear model on the tf-idf features; C=0.5 regularises a bit
# more than the default and max_iter is raised so the solver converges.
model_log=LogisticRegression(max_iter=3000,C=0.5,n_jobs=-1)
model_log.fit(train_post,train_target)
LogisticRegression(C=0.5, max_iter=3000, n_jobs=-1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression(C=0.5, max_iter=3000, n_jobs=-1)
# Per-class precision/recall/F1 on the training split.
# classes_ already holds the 16 type names in encoded order, so it can be
# passed directly instead of round-tripping inverse_transform(range(16)).
print('train classification report \n',classification_report(train_target,model_log.predict(train_post),
                                                             target_names=target_encoder.classes_))
train classification report
precision recall f1-score support
ENFJ 0.86 0.16 0.27 152
ENFP 0.80 0.65 0.72 540
ENTJ 0.93 0.29 0.44 185
ENTP 0.82 0.66 0.73 548
ESFJ 0.00 0.00 0.00 33
ESFP 0.00 0.00 0.00 38
ESTJ 0.00 0.00 0.00 31
ESTP 1.00 0.04 0.08 71
INFJ 0.74 0.83 0.78 1176
INFP 0.66 0.93 0.77 1466
INTJ 0.73 0.80 0.77 873
INTP 0.69 0.87 0.77 1043
ISFJ 0.89 0.24 0.38 133
ISFP 0.86 0.25 0.39 217
ISTJ 0.86 0.27 0.41 164
ISTP 0.86 0.51 0.64 270
accuracy 0.72 6940
macro avg 0.67 0.41 0.45 6940
weighted avg 0.74 0.72 0.69 6940
# Held-out performance of the logistic-regression baseline.
# classes_ replaces the equivalent inverse_transform(range(16)) round-trip.
print('test classification report \n',
      classification_report(test_target,model_log.predict(test_post),
                            target_names=target_encoder.classes_))
test classification report
precision recall f1-score support
ENFJ 1.00 0.08 0.15 38
ENFP 0.76 0.53 0.62 135
ENTJ 0.75 0.13 0.22 46
ENTP 0.66 0.51 0.58 137
ESFJ 0.00 0.00 0.00 9
ESFP 0.00 0.00 0.00 10
ESTJ 0.00 0.00 0.00 8
ESTP 0.00 0.00 0.00 18
INFJ 0.64 0.71 0.67 294
INFP 0.56 0.88 0.69 366
INTJ 0.61 0.65 0.63 218
INTP 0.67 0.84 0.74 261
ISFJ 0.67 0.12 0.21 33
ISFP 0.85 0.20 0.33 54
ISTJ 0.60 0.07 0.13 41
ISTP 0.71 0.45 0.55 67
accuracy 0.63 1735
macro avg 0.53 0.32 0.34 1735
weighted avg 0.64 0.63 0.59 1735
# Record the held-out accuracy for the final model comparison.
models_accuracy['logistic regression']=accuracy_score(test_target,
model_log.predict(test_post))
# Linear support-vector classifier; C=0.1 applies stronger regularisation
# than the default to counter the 5000-dimensional sparse features.
model_linear_svc=LinearSVC(C=0.1)
model_linear_svc.fit(train_post,train_target)
LinearSVC(C=0.1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LinearSVC(C=0.1)
# Train vs. test reports for the linear SVC.
# classes_ replaces the equivalent inverse_transform(range(16)) round-trip.
print('train classification report\n',
      classification_report(train_target,model_linear_svc.predict(train_post),
                            target_names=target_encoder.classes_))
print('test classification report\n',
      classification_report(test_target,model_linear_svc.predict(test_post),
                            target_names=target_encoder.classes_))
train classification report
precision recall f1-score support
ENFJ 0.89 0.46 0.61 152
ENFP 0.85 0.76 0.80 540
ENTJ 0.92 0.64 0.76 185
ENTP 0.84 0.82 0.83 548
ESFJ 0.91 0.30 0.45 33
ESFP 1.00 0.13 0.23 38
ESTJ 1.00 0.26 0.41 31
ESTP 0.92 0.48 0.63 71
INFJ 0.82 0.86 0.84 1176
INFP 0.77 0.93 0.84 1466
INTJ 0.83 0.85 0.84 873
INTP 0.81 0.90 0.85 1043
ISFJ 0.92 0.67 0.77 133
ISFP 0.89 0.58 0.70 217
ISTJ 0.88 0.66 0.76 164
ISTP 0.90 0.82 0.86 270
accuracy 0.82 6940
macro avg 0.88 0.63 0.70 6940
weighted avg 0.83 0.82 0.82 6940
test classification report
precision recall f1-score support
ENFJ 0.58 0.18 0.28 38
ENFP 0.75 0.59 0.66 135
ENTJ 0.61 0.30 0.41 46
ENTP 0.62 0.55 0.58 137
ESFJ 1.00 0.33 0.50 9
ESFP 0.00 0.00 0.00 10
ESTJ 1.00 0.12 0.22 8
ESTP 0.67 0.33 0.44 18
INFJ 0.68 0.72 0.70 294
INFP 0.62 0.86 0.72 366
INTJ 0.64 0.66 0.65 218
INTP 0.71 0.82 0.76 261
ISFJ 0.59 0.30 0.40 33
ISFP 0.82 0.33 0.47 54
ISTJ 0.81 0.32 0.46 41
ISTP 0.68 0.57 0.62 67
accuracy 0.66 1735
macro avg 0.67 0.44 0.49 1735
weighted avg 0.67 0.66 0.65 1735
# Record the held-out accuracy for the final model comparison.
models_accuracy['Linear Support Vector Classifier']=accuracy_score(
test_target,model_linear_svc.predict(test_post))
# Kernel SVM with default hyper-parameters (sklearn's default kernel is RBF).
model_svc=SVC()
model_svc.fit(train_post,train_target)
SVC()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
SVC()
# Train vs. test reports for the kernel SVC.
# classes_ replaces the equivalent inverse_transform(range(16)) round-trip.
print('train classification report\n',
      classification_report(train_target,model_svc.predict(train_post),
                            target_names=target_encoder.classes_))
print('test classification report\n',
      classification_report(test_target,model_svc.predict(test_post),
                            target_names=target_encoder.classes_))
train classification report
precision recall f1-score support
ENFJ 0.97 0.86 0.91 152
ENFP 0.96 0.95 0.95 540
ENTJ 0.99 0.90 0.94 185
ENTP 0.95 0.96 0.95 548
ESFJ 1.00 0.58 0.73 33
ESFP 1.00 0.37 0.54 38
ESTJ 1.00 0.52 0.68 31
ESTP 1.00 0.82 0.90 71
INFJ 0.95 0.97 0.96 1176
INFP 0.93 0.98 0.95 1466
INTJ 0.96 0.96 0.96 873
INTP 0.94 0.97 0.96 1043
ISFJ 1.00 0.89 0.94 133
ISFP 0.97 0.90 0.94 217
ISTJ 0.94 0.92 0.93 164
ISTP 0.97 0.94 0.95 270
accuracy 0.95 6940
macro avg 0.97 0.84 0.89 6940
weighted avg 0.95 0.95 0.95 6940
test classification report
precision recall f1-score support
ENFJ 0.62 0.26 0.37 38
ENFP 0.76 0.56 0.65 135
ENTJ 0.65 0.28 0.39 46
ENTP 0.64 0.53 0.58 137
ESFJ 0.33 0.11 0.17 9
ESFP 0.00 0.00 0.00 10
ESTJ 0.00 0.00 0.00 8
ESTP 0.71 0.28 0.40 18
INFJ 0.66 0.69 0.67 294
INFP 0.59 0.86 0.70 366
INTJ 0.64 0.63 0.64 218
INTP 0.66 0.83 0.73 261
ISFJ 0.82 0.27 0.41 33
ISFP 0.79 0.35 0.49 54
ISTJ 0.79 0.27 0.40 41
ISTP 0.75 0.57 0.64 67
accuracy 0.65 1735
macro avg 0.59 0.41 0.45 1735
weighted avg 0.65 0.65 0.63 1735
# Record the held-out accuracy for the final model comparison.
models_accuracy['Support Vector Classifier']=accuracy_score(test_target,
model_svc.predict(test_post))
# Multinomial naive Bayes baseline. NOTE(review): it is designed for count
# features; it accepts the fractional tf-idf values but may underperform.
model_multinomial_nb=MultinomialNB()
model_multinomial_nb.fit(train_post,train_target)
MultinomialNB()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
MultinomialNB()
# Train vs. test reports for multinomial naive Bayes.
print('Train Classification Report\n',
      classification_report(train_target,
                            model_multinomial_nb.predict(train_post),
                            target_names=target_encoder.classes_))
# BUG FIX: the "test" report previously re-used train_target/train_post and
# therefore just duplicated the training report (the two printed reports
# were identical). Evaluate on the held-out split instead.
print('Test classification report\n',
      classification_report(test_target,model_multinomial_nb.predict(test_post),
                            target_names=target_encoder.classes_))
Train Classification Report
precision recall f1-score support
ENFJ 0.00 0.00 0.00 152
ENFP 0.89 0.01 0.03 540
ENTJ 0.00 0.00 0.00 185
ENTP 0.91 0.05 0.10 548
ESFJ 0.00 0.00 0.00 33
ESFP 0.00 0.00 0.00 38
ESTJ 0.00 0.00 0.00 31
ESTP 0.00 0.00 0.00 71
INFJ 0.52 0.62 0.56 1176
INFP 0.35 0.94 0.51 1466
INTJ 0.78 0.42 0.54 873
INTP 0.58 0.63 0.61 1043
ISFJ 0.00 0.00 0.00 133
ISFP 0.00 0.00 0.00 217
ISTJ 0.00 0.00 0.00 164
ISTP 0.00 0.00 0.00 270
accuracy 0.46 6940
macro avg 0.25 0.17 0.15 6940
weighted avg 0.49 0.46 0.37 6940
Test classification report
precision recall f1-score support
ENFJ 0.00 0.00 0.00 152
ENFP 0.89 0.01 0.03 540
ENTJ 0.00 0.00 0.00 185
ENTP 0.91 0.05 0.10 548
ESFJ 0.00 0.00 0.00 33
ESFP 0.00 0.00 0.00 38
ESTJ 0.00 0.00 0.00 31
ESTP 0.00 0.00 0.00 71
INFJ 0.52 0.62 0.56 1176
INFP 0.35 0.94 0.51 1466
INTJ 0.78 0.42 0.54 873
INTP 0.58 0.63 0.61 1043
ISFJ 0.00 0.00 0.00 133
ISFP 0.00 0.00 0.00 217
ISTJ 0.00 0.00 0.00 164
ISTP 0.00 0.00 0.00 270
accuracy 0.46 6940
macro avg 0.25 0.17 0.15 6940
weighted avg 0.49 0.46 0.37 6940
# Record the held-out accuracy for the final model comparison.
models_accuracy['Multinomial Naive Bayes']=accuracy_score(test_target,
model_multinomial_nb.predict(test_post))
# Single decision tree, depth-capped at 14 to limit overfitting.
model_tree=DecisionTreeClassifier(max_depth=14)
model_tree.fit(train_post,train_target)
DecisionTreeClassifier(max_depth=14)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier(max_depth=14)
# Train vs. test reports for the decision tree.
# classes_ replaces the equivalent inverse_transform(range(16)) round-trip.
print('Train Classification report\n',
      classification_report(train_target,model_tree.predict(train_post),
                            target_names=target_encoder.classes_))
print('Test Classification report\n',
      classification_report(test_target,model_tree.predict(test_post),
                            target_names=target_encoder.classes_))
# Consistency fix: every other model records its test accuracy in
# models_accuracy, but the decision tree was missing from the comparison.
models_accuracy['Decision Tree Classifier']=accuracy_score(test_target,
model_tree.predict(test_post))
Train Classification report
precision recall f1-score support
ENFJ 0.77 0.57 0.65 152
ENFP 0.87 0.81 0.84 540
ENTJ 0.86 0.67 0.75 185
ENTP 0.94 0.76 0.84 548
ESFJ 1.00 0.39 0.57 33
ESFP 0.91 0.26 0.41 38
ESTJ 0.85 0.35 0.50 31
ESTP 0.83 0.42 0.56 71
INFJ 0.80 0.86 0.83 1176
INFP 0.63 0.93 0.75 1466
INTJ 0.87 0.77 0.82 873
INTP 0.88 0.82 0.85 1043
ISFJ 0.98 0.40 0.57 133
ISFP 0.97 0.64 0.77 217
ISTJ 0.96 0.55 0.70 164
ISTP 0.98 0.69 0.81 270
accuracy 0.79 6940
macro avg 0.88 0.62 0.70 6940
weighted avg 0.82 0.79 0.79 6940
Test Classification report
precision recall f1-score support
ENFJ 0.15 0.11 0.12 38
ENFP 0.51 0.52 0.51 135
ENTJ 0.25 0.22 0.23 46
ENTP 0.51 0.41 0.46 137
ESFJ 0.00 0.00 0.00 9
ESFP 0.00 0.00 0.00 10
ESTJ 0.00 0.00 0.00 8
ESTP 0.09 0.06 0.07 18
INFJ 0.55 0.62 0.58 294
INFP 0.48 0.67 0.56 366
INTJ 0.53 0.49 0.51 218
INTP 0.62 0.56 0.59 261
ISFJ 0.12 0.06 0.08 33
ISFP 0.54 0.35 0.43 54
ISTJ 0.45 0.24 0.32 41
ISTP 0.56 0.43 0.49 67
accuracy 0.51 1735
macro avg 0.34 0.30 0.31 1735
weighted avg 0.50 0.51 0.50 1735
# Random forest, depth-capped at 10 per tree.
model_forest=RandomForestClassifier(max_depth=10)
model_forest.fit(train_post,train_target)
RandomForestClassifier(max_depth=10)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(max_depth=10)
# Train vs. test reports for the random forest.
# classes_ replaces the equivalent inverse_transform(range(16)) round-trip.
print('Train Classification report\n',
      classification_report(train_target,model_forest.predict(train_post),
                            target_names=target_encoder.classes_))
print('Test Classification report\n',
      classification_report(test_target,model_forest.predict(test_post),
                            target_names=target_encoder.classes_))
Train Classification report
precision recall f1-score support
ENFJ 0.00 0.00 0.00 152
ENFP 1.00 0.26 0.41 540
ENTJ 1.00 0.06 0.11 185
ENTP 1.00 0.47 0.64 548
ESFJ 0.00 0.00 0.00 33
ESFP 0.00 0.00 0.00 38
ESTJ 0.00 0.00 0.00 31
ESTP 1.00 0.01 0.03 71
INFJ 0.84 0.82 0.83 1176
INFP 0.43 1.00 0.60 1466
INTJ 0.86 0.77 0.81 873
INTP 0.81 0.88 0.84 1043
ISFJ 1.00 0.04 0.07 133
ISFP 1.00 0.04 0.08 217
ISTJ 1.00 0.02 0.05 164
ISTP 1.00 0.16 0.28 270
accuracy 0.65 6940
macro avg 0.68 0.28 0.30 6940
weighted avg 0.77 0.65 0.60 6940
Test Classification report
precision recall f1-score support
ENFJ 0.00 0.00 0.00 38
ENFP 0.80 0.03 0.06 135
ENTJ 0.00 0.00 0.00 46
ENTP 0.83 0.11 0.19 137
ESFJ 0.00 0.00 0.00 9
ESFP 0.00 0.00 0.00 10
ESTJ 0.00 0.00 0.00 8
ESTP 0.00 0.00 0.00 18
INFJ 0.60 0.49 0.54 294
INFP 0.33 0.95 0.49 366
INTJ 0.63 0.47 0.54 218
INTP 0.59 0.57 0.58 261
ISFJ 0.00 0.00 0.00 33
ISFP 0.00 0.00 0.00 54
ISTJ 0.00 0.00 0.00 41
ISTP 0.00 0.00 0.00 67
accuracy 0.44 1735
macro avg 0.24 0.16 0.15 1735
weighted avg 0.47 0.44 0.37 1735
# Record the held-out accuracy for the final model comparison.
models_accuracy['Random Forest Classifier']=accuracy_score(test_target,
model_forest.predict(test_post))
# Gradient-boosted trees on GPU: 50 shallow (depth-5) trees, lr 0.1.
# NOTE(review): gpu_id / tree_method='gpu_hist' are the pre-2.0 XGBoost GPU
# switches; on xgboost >= 2.0 use device='cuda', tree_method='hist'.
model_xgb=XGBClassifier(gpu_id=0,
tree_method='gpu_hist',
max_depth=5,n_estimators=50,
learning_rate=0.1)
model_xgb.fit(train_post,train_target)
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
early_stopping_rounds=None, enable_categorical=False,
eval_metric=None, gamma=0, gpu_id=0, grow_policy='depthwise',
importance_type=None, interaction_constraints='',
learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
max_delta_step=0, max_depth=5, max_leaves=0, min_child_weight=1,
missing=nan, monotone_constraints='()', n_estimators=50, n_jobs=0,
num_parallel_tree=1, objective='multi:softprob', predictor='auto',
random_state=0, reg_alpha=0, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
early_stopping_rounds=None, enable_categorical=False,
eval_metric=None, gamma=0, gpu_id=0, grow_policy='depthwise',
importance_type=None, interaction_constraints='',
learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
max_delta_step=0, max_depth=5, max_leaves=0, min_child_weight=1,
missing=nan, monotone_constraints='()', n_estimators=50, n_jobs=0,
num_parallel_tree=1, objective='multi:softprob', predictor='auto',
random_state=0, reg_alpha=0, ...)print('Train Classification report\n',
classification_report(train_target,model_xgb.predict(train_post),
target_names=target_encoder.inverse_transform(
[i for i in range(16)])))
# Held-out report for XGBoost.
# classes_ replaces the equivalent inverse_transform(range(16)) round-trip.
print('Test Classification report\n',
      classification_report(test_target,model_xgb.predict(test_post),
                            target_names=target_encoder.classes_))
Train Classification report
precision recall f1-score support
ENFJ 0.99 0.93 0.96 152
ENFP 0.94 0.91 0.92 540
ENTJ 0.99 0.91 0.95 185
ENTP 0.94 0.91 0.92 548
ESFJ 1.00 0.85 0.92 33
ESFP 1.00 0.92 0.96 38
ESTJ 1.00 0.84 0.91 31
ESTP 1.00 0.94 0.97 71
INFJ 0.91 0.90 0.91 1176
INFP 0.89 0.95 0.92 1466
INTJ 0.92 0.92 0.92 873
INTP 0.90 0.92 0.91 1043
ISFJ 1.00 0.95 0.98 133
ISFP 0.98 0.91 0.94 217
ISTJ 0.99 0.93 0.96 164
ISTP 0.97 0.97 0.97 270
accuracy 0.92 6940
macro avg 0.96 0.92 0.94 6940
weighted avg 0.92 0.92 0.92 6940
Test Classification report
precision recall f1-score support
ENFJ 0.68 0.39 0.50 38
ENFP 0.71 0.64 0.67 135
ENTJ 0.65 0.37 0.47 46
ENTP 0.58 0.58 0.58 137
ESFJ 0.00 0.00 0.00 9
ESFP 1.00 0.10 0.18 10
ESTJ 1.00 0.12 0.22 8
ESTP 0.50 0.28 0.36 18
INFJ 0.68 0.76 0.72 294
INFP 0.67 0.81 0.73 366
INTJ 0.69 0.65 0.67 218
INTP 0.70 0.78 0.74 261
ISFJ 0.67 0.55 0.60 33
ISFP 0.65 0.41 0.50 54
ISTJ 0.71 0.41 0.52 41
ISTP 0.68 0.64 0.66 67
accuracy 0.67 1735
macro avg 0.66 0.47 0.51 1735
weighted avg 0.67 0.67 0.66 1735
models_accuracy['XGBoost Classifier']=accuracy_score(test_target,model_xgb.predict(test_post))
# Multiclass CatBoost on GPU with default iterations; verbose=False
# suppresses the per-iteration training log.
model_cat=CatBoostClassifier(loss_function='MultiClass',eval_metric='MultiClass',
task_type='GPU',verbose=False)
model_cat.fit(train_post,train_target)
Warning: less than 75% gpu memory available for training. Free: 2739.700001 Total: 4095.6875
<catboost.core.CatBoostClassifier at 0x21c404bb9a0>
# Train vs. test reports for CatBoost.
# classes_ replaces the equivalent inverse_transform(range(16)) round-trip.
print('Train Classification report\n',
      classification_report(train_target,model_cat.predict(train_post),
                            target_names=target_encoder.classes_))
print('Test Classification report\n',
      classification_report(test_target,model_cat.predict(test_post),
                            target_names=target_encoder.classes_))
Train Classification report
precision recall f1-score support
ENFJ 0.88 0.62 0.73 152
ENFP 0.84 0.79 0.81 540
ENTJ 0.89 0.66 0.76 185
ENTP 0.82 0.80 0.81 548
ESFJ 0.94 0.52 0.67 33
ESFP 1.00 0.34 0.51 38
ESTJ 1.00 0.39 0.56 31
ESTP 0.92 0.63 0.75 71
INFJ 0.82 0.86 0.84 1176
INFP 0.80 0.90 0.85 1466
INTJ 0.82 0.82 0.82 873
INTP 0.79 0.88 0.83 1043
ISFJ 0.92 0.70 0.79 133
ISFP 0.86 0.67 0.76 217
ISTJ 0.90 0.73 0.81 164
ISTP 0.88 0.80 0.83 270
accuracy 0.82 6940
macro avg 0.88 0.69 0.76 6940
weighted avg 0.83 0.82 0.82 6940
Test Classification report
precision recall f1-score support
ENFJ 0.64 0.37 0.47 38
ENFP 0.71 0.65 0.68 135
ENTJ 0.80 0.43 0.56 46
ENTP 0.60 0.59 0.59 137
ESFJ 0.00 0.00 0.00 9
ESFP 0.00 0.00 0.00 10
ESTJ 1.00 0.25 0.40 8
ESTP 0.83 0.56 0.67 18
INFJ 0.70 0.73 0.71 294
INFP 0.67 0.79 0.72 366
INTJ 0.65 0.67 0.66 218
INTP 0.68 0.77 0.72 261
ISFJ 0.70 0.58 0.63 33
ISFP 0.61 0.43 0.50 54
ISTJ 0.80 0.39 0.52 41
ISTP 0.68 0.69 0.68 67
accuracy 0.67 1735
macro avg 0.63 0.49 0.53 1735
weighted avg 0.67 0.67 0.66 1735
# Record the held-out accuracy, then show the full comparison table.
models_accuracy['CatBoost Classifier']=accuracy_score(test_target,
model_cat.predict(test_post))
models_accuracy  # XGBoost (~0.674) and CatBoost (~0.673) lead the comparison
{'logistic regression': 0.6282420749279539,
'Linear Support Vector Classifier': 0.6628242074927954,
'Support Vector Classifier': 0.6478386167146974,
'Multinomial Naive Bayes': 0.37809798270893374,
'Random Forest Classifier': 0.4386167146974063,
'XGBoost Classifier': 0.6743515850144092,
'CatBoost Classifier': 0.6731988472622479}
def clear_text1(data):
    """Clean an iterable of raw text strings.

    Mirrors the cleaning applied to the training corpus: lowercase,
    remove URLs, and replace every non-alphanumeric character with a
    space.

    Parameters
    ----------
    data : iterable of str

    Returns
    -------
    list of str
        The cleaned strings, one per input.
    """
    try:
        from tqdm import tqdm as progress  # optional progress bar
    except ImportError:  # degrade gracefully when tqdm is unavailable
        def progress(it):
            return it
    # The original also built an unused WordNetLemmatizer and a
    # data_length list that was never returned; both removed.
    cleaned_text = []
    for sentence in progress(data):
        sentence = sentence.lower()
        # Remove URLs first so their characters don't leak into the text.
        sentence = re.sub(r'https?://[^\s<>"]+|www\.[^\s<>"]+', '', sentence)
        sentence = re.sub(r'[^0-9a-z]', ' ', sentence)
        cleaned_text.append(sentence)
    return cleaned_text
text_data
['hi i am very good boy staying alone being happpy ']
# Smoke-test the full pipeline on a hand-written sentence.
text=["Hi, I am very good boy staying alone being happpy. i like to stay happy making friend enjoying party"]
text_data=clear_text1(text)
text_v=vectorizer.transform(text_data).toarray()  # tf-idf with the training vocabulary
100%|████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<?, ?it/s]
text_v
array([[0., 0., 0., ..., 0., 0., 0.]])
a=model_cat.predict(text_v)  # CatBoost returns a 2-D array of encoded labels
a[0][0]  # the single encoded prediction
9
a
array([[9]], dtype=int64)
target_encoder.classes_[a[0][0]]
'INFP'